# Extract RMS data for the relevant UPCs
# JHL  

### PRELIMINARIES 
# Packages this script depends on; any that are not installed yet are
# installed from CRAN before being attached.
list.of.packages <- c("folderfun", "data.table", "bit64", "lubridate", "foreign")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[, "Package"])]
if (length(new.packages) > 0) {
	install.packages(new.packages, repos = "http://cran.us.r-project.org")
}

# library() stops right here if a package is unavailable. require() would only
# return FALSE with a warning, and the script would then fail further down
# with a much less helpful "could not find function" error.
library(folderfun)
library(data.table)
library(bit64)
library(lubridate)
library(foreign)

# Index of this task within the SLURM job array (original note: 1-100).
# The same environment variable is read again before the task split below.
myID <- as.numeric(Sys.getenv("SLURM_ARRAY_TASK_ID"))

# Data years: 06-14
# Folder locations are resolved through folderfun.
# "rms_move" is the repository holding the RMS data from 06-14, merged to
# module level and stored in RData format as "move".
setff("rms_move")
repository <- ffrms_move()
setff("path_home")
path_home <- ffpath_home()

# Working sub-directories below path_home
do_path   <- file.path(path_home, "do")
dta_path  <- file.path(path_home, "raw/nielsen")
save_path <- file.path(path_home, "dta/nielsen/pranks")

# Build (once) and then load the table of module files.
# The table is cached as `file.table` in <do_path>/module_0614_files.RData with
# columns:
#   file_name - full path of the .RData file
#   id        - position in the size-sorted listing (1 = largest file)
#   file      - base name of the file
#   module    - the digits of the base name, as a number
files_cache <- sprintf("%s/module_0614_files.RData", do_path)

if (!file.exists(files_cache)) {

	ptm <- proc.time()

	# List the repository's .RData files in descending order of size through a
	# system call (kept from the original, which notes it is much more efficient;
	# the size ordering can also be used to schedule arrays by size).
	# Options are placed before the glob so the call does not depend on GNU
	# option permutation, and the working directory is always restored.
	old_wd <- setwd(repository)
	listing <- tryCatch(
		system("ls -S *.RData", intern = TRUE),
		finally = setwd(old_wd)
	)
	if (length(listing) == 0) {
		stop("No .RData files found in ", repository, call. = FALSE)
	}

	file.table <- data.table(file_name = file.path(repository, listing))
	file.table[, id := .I]

	# Sample: "/project2/databases/nielsen/nielsen_extracts/RMS/Movement_Files_Combined_RFormatted/1040.RData"
	file.table[, file := basename(file_name)]
	# Extract only digits: the module
	file.table[, module := as.numeric(gsub("[^0-9]", "", file))]

	save(file.table, file = files_cache) # Save to RData for quick loading
	proc.time() - ptm

}

load(files_cache)

### 4. Go back to module files, extract all RMS data of relevant UPCs ###
# Keep only the modules of interest.
keep_modules <- c(1040, 1290, 1303, 1362, 1463, 1484, 1493,
                  3603, 7080, 7260, 7734, 8404)
file.table <- file.table[module %in% keep_modules]

# Index of this task within the SLURM job array; this stage expects 1 or 2.
myID <- as.numeric(Sys.getenv("SLURM_ARRAY_TASK_ID"))
if (is.na(myID)) {
	# Without this check an unset/non-numeric id silently selects no modules.
	stop("SLURM_ARRAY_TASK_ID must be set to a number (expected 1 or 2)",
	     call. = FALSE)
}

# Deal the remaining modules out alternately to task 1 and task 2.
# rep_len() spells the recycling out: recent data.table versions refuse to
# recycle a length-2 right-hand side in `:=`, and rep_len() also copes with an
# odd (or zero) number of rows.
file.table[, two := rep_len(1:2, .N)]
file.table <- file.table[two == myID]

# For each module and channel code, extract the RMS data of the top-ranked UPCs
# and write it to <save_path>/<module>_top1_<channel>.dta.
# Outputs that already exist are skipped, so an interrupted run can be resumed.
#
# Expensive inputs are read only when an output is actually missing, and only
# once: the store/channel lookup once per run, the module data once per module.
channels <- c("D", "F", "M")
store_cols <- c("store_code_uc", "year")
upc_cols <- c("upc", "upc_ver_uc", "channel_code")
stores_key <- NULL # store/year -> channel_code lookup, filled on first use

for (m in file.table[, module]) {
	move <- NULL # module data (all channels), filled on first use

	for (channel in channels) {
		out_file <- sprintf('%s/%s_top1_%s.dta', save_path, m, channel)
		if (file.exists(out_file)) {
			print(sprintf('%s %s Skip', m, channel))
			next
		}

		if (is.null(stores_key)) {
			stores_0615 <- data.table(read.dta(sprintf("%s/stores_0615.dta", dta_path)))
			stores_key <- stores_0615[, .(store_code_uc, year, channel_code)]
			setkeyv(stores_key, store_cols)
		}

		if (is.null(move)) {
			# The module file provides the data.table `move`.
			load(file.table[module == m, file_name])

			# Merge with channel code (inner join on store and year)
			setkeyv(move, store_cols)
			move <- move[stores_key, on = store_cols, nomatch = 0]
			setkeyv(move, upc_cols)
		}

		# Merge with top ranked only
		top1_pranks <- readRDS(sprintf('%s/top1_pranks_%s.rds', save_path, channel))
		# Sort before converting, so rows come out in the same order as before.
		setorderv(top1_pranks, upc_cols)
		# Convert upc so both sides of the join have the same type (presumably
		# move's upc is integer64 - confirm), and drop the V1 column.
		# The join names its columns explicitly: assigning to a key column with
		# `:=` drops the key, after which a keyed join would silently fall back
		# to matching on whatever the first columns of top1_pranks happen to be.
		top1_pranks[, ':='(upc = as.integer64(upc), V1 = NULL)]

		top1 <- move[top1_pranks, on = upc_cols, nomatch = 0]
		# upc is turned into text before export to the .dta file.
		top1[, upc := as.character(upc)]

		# saveRDS(prank,sprintf('%s/%s_prank.rds',save_path,m))
		# write.csv(prank, file = sprintf('%s/%s_prank.csv',save_path,m))
		write.dta(top1, file = out_file)
		rm(top1)
		print(sprintf('%s %s Done', m, channel))

		gc()
	}

	# Release the module data before moving on to the next module.
	rm(move)
	gc()
}

